library(webshot)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ lubridate::union() masks base::union()
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
library(leaflet)
setwd("/Users/samuellu/Desktop/PM566/GitHub/pm566-fall2022-labs_Sam/assignments/assignment01/")
data_2004 <- data.table::fread("ad_viz_plotval_data_2004.csv")
data_2019 <- data.table::fread("ad_viz_plotval_data_2019.csv")
str(data_2004)
## Classes 'data.table' and 'data.frame': 19233 obs. of 20 variables:
## $ Date : chr "01/01/2004" "01/02/2004" "01/03/2004" "01/04/2004" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Daily Mean PM2.5 Concentration: num 11 12.2 16.5 19.5 11.5 32.5 15.5 29.9 21 15.7 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 46 51 60 67 48 94 58 88 70 59 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : int 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88502 88502 88502 88502 88502 88502 88502 88502 88502 88101 ...
## $ AQS_PARAMETER_DESC : chr "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(data_2019)
## Classes 'data.table' and 'data.frame': 53156 obs. of 20 variables:
## $ Date : chr "01/01/2019" "01/02/2019" "01/03/2019" "01/04/2019" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Daily Mean PM2.5 Concentration: num 5.7 11.9 20.1 28.8 11.2 2.7 2.8 7 3.1 7.1 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 24 50 68 86 47 11 12 29 13 30 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
## $ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
#table(data_2004$Date)
#table(data_2004$`Site Name`)
#table(data_2004$CBSA_NAME)
#table(data_2004$STATE)
table(data_2004$COUNTY)
##
## Alameda Butte Calaveras Colusa Contra Costa
## 458 444 61 201 259
## Del Norte El Dorado Fresno Humboldt Imperial
## 117 128 811 58 366
## Inyo Kern Kings Lake Los Angeles
## 247 1304 84 60 2243
## Marin Mariposa Mendocino Merced Mono
## 120 320 61 90 181
## Monterey Nevada Orange Placer Plumas
## 60 232 529 246 258
## Riverside Sacramento San Benito San Bernardino San Diego
## 1203 1453 122 893 1431
## San Francisco San Joaquin San Luis Obispo San Mateo Santa Barbara
## 560 122 176 354 225
## Santa Clara Santa Cruz Shasta Siskiyou Solano
## 660 56 181 117 361
## Sonoma Stanislaus Sutter Trinity Tulare
## 92 183 309 115 627
## Ventura Yolo
## 580 475
#table(data_2019$Date)
#table(data_2019$`Site Name`)
#table(data_2019$CBSA_NAME)
#table(data_2019$STATE)
table(data_2019$COUNTY)
##
## Alameda Butte Calaveras Colusa Contra Costa
## 2137 1254 338 728 811
## Del Norte El Dorado Fresno Glenn Humboldt
## 309 177 2671 356 107
## Imperial Inyo Kern Kings Lake
## 1006 902 2084 805 60
## Los Angeles Madera Marin Mariposa Mendocino
## 4918 357 457 536 713
## Merced Mono Monterey Napa Nevada
## 479 890 1126 355 526
## Orange Placer Plumas Riverside Sacramento
## 883 1685 530 4368 2134
## San Benito San Bernardino San Diego San Francisco San Joaquin
## 466 2617 2155 359 1307
## San Luis Obispo San Mateo Santa Barbara Santa Clara Santa Cruz
## 1419 344 1518 1200 697
## Shasta Siskiyou Solano Sonoma Stanislaus
## 146 458 716 343 798
## Sutter Tehama Trinity Tulare Ventura
## 708 353 345 975 2125
## Yolo
## 405
summary(data_2004$`Daily Mean PM2.5 Concentration`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.10 6.00 10.10 13.13 16.30 251.00
#summary(data_2004$CBSA_CODE) #1253 NA's
#summary(data_2004$COUNTY_CODE)
#summary(data_2004$SITE_LATITUDE)
#summary(data_2004$SITE_LONGITUDE)
summary(data_2019$`Daily Mean PM2.5 Concentration`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.20 4.00 6.50 7.74 9.90 120.90
#summary(data_2019$CBSA_CODE) #4181 NA's
#summary(data_2019$COUNTY_CODE)
#summary(data_2019$SITE_LATITUDE)
#summary(data_2019$SITE_LONGITUDE)
There are 1253 NAs in CBSA_CODE of data_2004 and 4181 NAs in CBSA_CODE of data_2019. However, it is strange to have negative values of PM2.5.
data_2004$Year <- "2004"
data_2019$Year <- "2019"
data_2004_2019 <- rbind(data_2004, data_2019)
colnames(data_2004_2019)[5] <- "pm2.5"
colnames(data_2004_2019)[19] <- "lat"
colnames(data_2004_2019)[20] <- "lon"
colnames(data_2004)[5] <- "pm2.5"
colnames(data_2004)[19] <- "lat"
colnames(data_2004)[20] <- "lon"
colnames(data_2019)[5] <- "pm2.5"
colnames(data_2019)[19] <- "lat"
colnames(data_2019)[20] <- "lon"
leaflet() %>%
# The looks of the Map
addProviderTiles('CartoDB.Positron') %>%
# Some circles
addCircles(
data = unique(data_2004_2019[Year == "2004"]),
lat = ~lat, lng=~lon,
color = "blue",
opacity = 0.1, fillOpacity = 1, radius = 500
) %>%
addCircles(
data = unique(data_2004_2019[Year == "2019"]),
lat = ~lat, lng=~lon,
color = "red",
opacity = 0.1, fillOpacity = 1, radius = 100
)
leaflet() %>%
addProviderTiles('OpenStreetMap') %>%
addCircles(data = unique(data_2004_2019[Year =="2004"]),
lat=~lat,lng=~lon,
color = "blue",
opacity = 0.1, fillOpacity = 1, radius = 500) %>%
addCircles(data = unique(data_2004_2019[Year =="2019"]),
lat=~lat,lng=~lon,
color = "red",
opacity = 0.1, fillOpacity = 1, radius = 100)
It seems that there are more monitoring sites in the bit cities like San Jose, LA, San Francisco and San Diego.
summary(data_2004_2019$pm2.5)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.200 4.400 7.200 9.172 11.300 251.000
nrow(data_2004_2019[pm2.5 <0])
## [1] 283
There is no missing values of PM2.5 in the combined dataset. However, it is implausible to have negative value of PM2.5. Therefore, I’m going to replace negative values into NA.
data_2004_2019[data_2004_2019$pm2.5 <0] <- NA
summary(data_2004_2019$pm2.5)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 4.400 7.300 9.212 11.300 251.000 283
length(unique(data_2004_2019[Year ==2004]$`Site ID`))
## [1] 106
length(unique(data_2004_2019[Year ==2019]$`Site ID`))
## [1] 160
mean(data_2004_2019[Year ==2004]$pm2.5)
## [1] 13.13107
mean(data_2004_2019[Year ==2019]$pm2.5)
## [1] 7.785987
Monitoring sites were getting more and more. In 2019, the total mean pm2.5 level was lower than in 2004.
First, I create 2004 and 2019 datasets with Date, STATE, COUNTY, Site Name, pm2.5, lat, and lon by Site ID.
data_2004_avg <- data_2004[, .(
Date = Date,
STATE = STATE,
COUNTY = COUNTY,
`Site Name` =`Site Name`,
pm2.5 = mean(pm2.5, na.rm = T),
lat = mean(lat, na.rm = T),
lon = mean(lon, na.rm = T)
), by="Site ID"]
data_2004_avg$year <- "2004"
data_2019_avg <- data_2019[, .(
Date = Date,
STATE = STATE,
COUNTY = COUNTY,
`Site Name` =`Site Name`,
pm2.5 = mean(pm2.5, na.rm = T),
lat = mean(lat, na.rm = T),
lon = mean(lon, na.rm = T)
), by="Site ID"]
data_2019_avg$year <- "2019"
Second, I merged two datasets.
data_0419 <- rbind(data_2004_avg, data_2019_avg)
Add one column of region.
data_0419[, region := fifelse(lon >= -98 & lat > 39.71, "NE",
fifelse(lon < -98 & lat > 39.71, "NW",
fifelse(lon < -98 & lat <= 39.71, "SW","SE")))
]
table(data_0419$region)
##
## NW SW
## 4110 68279
Add on column of pm_health (pm2.5 <= 12 is labeled as health).
data_0419[, pm_health := fifelse(pm2.5 <= 12, "healthy",
fifelse(pm2.5 > 12 & pm2.5 < 35, "normal", "unhealthy"))]
table(data_0419$pm_health)
##
## healthy normal
## 56740 15649
data_0419[STATE == "California"] %>%
ggplot() +
geom_boxplot(mapping = aes(x = year, y = pm2.5, color=pm_health, fill = pm_health)) +
facet_wrap(~ region, nrow = 1)
In California, southwestern region pm2.5 level is higher than northwestern region.
data_0419[COUNTY == "Los Angeles"] %>%
ggplot() +
geom_boxplot(mapping = aes(x = year, y = pm2.5, color=pm_health, fill = pm_health)) +
facet_wrap(~ region, nrow = 1)
In Los Angeles, pm2.5 level is higher in 2004 than in 2019.
data_0419[`Site Name` == "Los Angeles-North Main Street"] %>%
ggplot() +
geom_boxplot(mapping = aes(x = year, y = pm2.5, color=pm_health, fill = pm_health)) +
facet_wrap(~ region, nrow = 1)
In the Los Angeles-North Main Street, pm2.5 level turns into healthy in 2019.
ggplot(data = data_0419[STATE == "California"], aes(x=year, y=pm2.5, color=pm_health, fill = pm_health)) +
geom_violin() +
geom_point() +
facet_wrap(~ region, nrow = 1)
California pm2.5 level is lower in 2019 than in 2004 in both northwestern and southwestern.
ggplot(data = data_0419[COUNTY == "Los Angeles"], aes(x=year, y=pm2.5, color=pm_health, fill = pm_health)) +
geom_violin() +
geom_point() +
facet_wrap(~ region, nrow = 1)
In Los Angeles, most pm2.5 levels is healthier in 2019 than in 2004.
ggplot(data = data_0419[`Site Name` == "Los Angeles-North Main Street"], aes(x=year, y=pm2.5, color=pm_health, fill = pm_health)) +
geom_violin() +
geom_point()
On main street in LA, its pm2.5 level used to be higher in 2004 but it’s within the healthy range in 2019.
ggplot(data = data_0419[STATE == "California"], aes(x=pm2.5, color=year, fill = year)) +
geom_histogram(fill="white", alpha=0.5, position="identity") +
facet_wrap(~ region, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
In this histogram, there are more data in southwestern region than in northwestern region.
ggplot(data = data_0419[COUNTY == "Los Angeles"], aes(x=pm2.5, color=year, fill = year)) +
geom_histogram(fill="white", alpha=0.5, position="identity") +
facet_wrap(~ region, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
In Los Angeles, there are more counts in the range of 15 to 20 in 2004 than in 2019.
ggplot(data = data_0419[`Site Name` == "Los Angeles-North Main Street"], aes(x=pm2.5, color=year, fill = year)) +
geom_histogram(fill="white", alpha=0.5, position="identity") +
facet_wrap(~ region, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
On the North Main Street, its used to get pm2.5 levels around 20 in 2004. However, it is lower in 2019.
From above data and plots, I would say that the daily concentrations of p2.5 have decreased in California over the last 15 years.